In [3]:
import pandas as pd
import numpy as np
In [4]:
# Load the review data set from a CSV file in the working directory into a DataFrame.
products = pd.read_csv('amazon_baby_subset.csv')
In [3]:
# Preview the first ten entries of the 'name' column.
products['name'][:10]
Out[3]:
In [5]:
# Number of reviews labelled +1, number labelled -1, and number of non-null labels.
# The whole expression is wrapped in print(...) so that the reduction is applied to
# the Series rather than to print's return value; this form runs on Python 2 and 3.
print((products['sentiment'] == 1).sum())
print((products['sentiment'] == -1).sum())
print(products['sentiment'].count())
In [5]:
import json
# Read the list of words used as features; the file is closed when the block exits.
with open('important_words.json') as important_words_file:
    important_words = json.load(important_words_file)
# Show the first three words as a sanity check (parenthesised print runs on Python 2 and 3).
print(important_words[:3])
In [6]:
# Replace missing values in the 'review' column with the empty string so the string
# operations applied to that column below do not hit NaN.
products = products.fillna({'review':''}) # fill in N/A's in the review column
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed.

    The original ``text.translate(None, string.punctuation)`` only works on
    Python 2 byte strings; it raises ``TypeError`` on unicode text and on
    Python 3 ``str``. Filtering character by character gives the same result
    for those inputs and also works for the others.

    Parameters
    ----------
    text : str
        The text to clean. An empty string yields an empty string.

    Returns
    -------
    str
        ``text`` without ASCII punctuation characters; all other characters
        (including whitespace) are kept in their original order.
    """
    import string
    # A set gives constant-time membership tests inside the generator below.
    punctuation = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)
# Store a punctuation-free copy of every review in a new 'review_clean' column.
products['review_clean'] = products['review'].apply(remove_punctuation)
# Display the first three rows to inspect the new column.
products.head(3)
Out[6]:
In [7]:
# Tokenise every cleaned review once. The original re-split each review for every
# word in important_words, repeating identical work on each pass of the loop.
review_tokens = products['review_clean'].apply(lambda review: review.split())
# Add one column per important word holding how many times it occurs in each review.
for word in important_words:
    # Bind `word` as a default argument so each lambda counts its own word.
    products[word] = review_tokens.apply(lambda tokens, word=word: tokens.count(word))
In [8]:
# Display the first row to inspect the per-word count columns added by the loop above.
products.head(1)
Out[8]:
In [9]:
# Flag reviews whose count for the word 'perfect' is at least one.
products['contains_perfect'] = products['perfect'] >= 1
# Summing the boolean column gives the number of flagged reviews
# (parenthesised print runs on Python 2 and 3).
print(products['contains_perfect'].sum())
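The function should accept three parameters:
- `dataframe`: the data frame that holds the feature columns and the label column
- `features`: a list of the column names to use as features
- `label`: the name of the column that holds the target labels
The function should return two values:
- a 2D NumPy array containing a constant column followed by the feature columns
- a 1D NumPy array containing the values of the label column
The function should do the following:
- add a column named `constant` filled with 1 to the data frame
- prepend `'constant'` to the list of feature names
- select those columns and convert them to a NumPy matrix
- convert the label column to a NumPy array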
In [10]:
def get_numpy_data(dataframe, features, label):
    """Extract a feature matrix (with intercept column) and a label array.

    ``DataFrame.as_matrix()`` has been removed from pandas, so the conversion
    uses the ``.values`` attribute, which returns the same NumPy array and is
    available in old and new pandas releases alike.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data. NOTE: a column named ``'constant'`` filled with 1 is added
        to (or overwritten in) this frame in place, exactly as before.
    features : sequence of str
        Names of the feature columns, in the order they should appear.
        Any sequence is accepted; it is not modified.
    label : str
        Name of the column holding the target values.

    Returns
    -------
    (feature_matrix, label_array) : tuple of numpy.ndarray
        ``feature_matrix`` has one row per data-frame row; its first column is
        the constant 1 and the remaining columns follow ``features``.
        ``label_array`` holds the values of the ``label`` column.
    """
    # The intercept term is modelled as a feature that is always 1.
    dataframe['constant'] = 1
    columns = ['constant'] + list(features)
    feature_matrix = dataframe[columns].values
    label_array = dataframe[label].values
    return (feature_matrix, label_array)
In [11]:
# Build the feature matrix (constant column followed by one count column per
# important word) and the array of sentiment labels.
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
In [12]:
# (number of reviews, number of features including the constant column);
# parenthesised print runs on Python 2 and 3.
print(feature_matrix.shape)
In [19]:
def predict_probability(feature_matrix, coefficients):
    """Produce the probabilistic estimate for P(y_i = +1 | x_i, w).

    feature_matrix: N * D
    coefficients:   D * 1
    returns:        N * 1 predictions, each ranging between 0 and 1
    """
    # Linear score of every data point: dot product of features and coefficients.
    linear_scores = np.dot(feature_matrix, coefficients)
    # The logistic link function turns each score into a probability.
    return 1.0 / (1 + np.exp(-linear_scores))
In [14]:
"""
errors: N * 1
feature: N * 1
derivative: 1
"""
def feature_derivative(errors, feature):
    """Return the dot product of the transposed ``errors`` with ``feature``.

    This is the derivative of the log likelihood with respect to the single
    coefficient whose feature column is ``feature``.
    """
    errors_transposed = np.transpose(errors)
    return np.dot(errors_transposed, feature)
In [15]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    Computes ``sum_i ((1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)))``
    where ``score_i`` is the dot product of row ``i`` of ``feature_matrix``
    with ``coefficients``.

    Two defects of the earlier version are fixed:

    * With 1-D ``coefficients`` the scores were 1-D while the indicator was a
      column, so the product broadcast to an N x N matrix and the sum was
      wrong. Both operands are now columns.
    * ``np.log(1 + np.exp(-score))`` overflowed to ``inf`` for very negative
      scores. ``np.logaddexp(0, -score)`` is the same quantity evaluated
      without overflow.

    Parameters
    ----------
    feature_matrix : array of shape (N, D)
    sentiment : array-like with N labels; entries equal to +1 count as positive
    coefficients : array of shape (D,) or (D, 1)

    Returns
    -------
    float
        The log likelihood (a NumPy floating-point scalar).
    """
    scores = np.asarray(np.dot(feature_matrix, coefficients))
    if scores.ndim == 1:
        # Promote to a column so it lines up element-wise with the indicator.
        scores = scores.reshape(-1, 1)
    indicator = (np.asarray(sentiment).reshape(-1, 1) == +1)
    # logaddexp(0, -s) == log(exp(0) + exp(-s)) == log(1 + exp(-s)), overflow-free.
    lp = np.sum((indicator - 1) * scores - np.logaddexp(0, -scores))
    return lp
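The function accepts the following parameters:
- `feature_matrix`: 2D array of features
- `sentiment`: 1D array of class labels
- `initial_coefficients`: array of initial coefficient values
- `step_size`: the size of each gradient-ascent step
- `max_iter`: the number of iterations to run
The function carries out the following steps:
- initialise `coefficients` to a copy of `initial_coefficients`
- predict the class probability for every data point with `predict_probability`
- compute the indicator value for `y_i = +1` and the errors as indicator minus predictions
- for each coefficient, compute the derivative with `feature_derivative` and add the step size times the derivative to the coefficient
- at selected iterations, compute and print the log likelihood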
In [35]:
# coefficients: D * 1
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by gradient ascent on the log likelihood.

    Changes relative to the earlier version:

    * ``range`` and parenthesised ``print`` replace ``xrange`` and the print
      statement, so the function runs on Python 2 and Python 3.
    * Coefficients are held internally as a flat float array. 1-D initial
      coefficients used to broadcast the errors to an N x N matrix, and
      integer initial coefficients could not be updated in place; both work now.
    * The label indicator does not change between iterations and is computed once.

    Parameters
    ----------
    feature_matrix : array of shape (N, D)
    sentiment : array-like with N labels; entries equal to +1 count as positive
    initial_coefficients : array-like with D entries, shape (D,) or (D, 1).
        It is copied, never modified.
    step_size : float
        Multiplier applied to each derivative before it is added to a coefficient.
    max_iter : int
        Number of passes over all coefficients.

    Returns
    -------
    numpy.ndarray
        The fitted coefficients as floats, in the same shape as
        ``initial_coefficients``.

    Side effects
    ------------
    Prints the log likelihood for iterations 0-15, then every 10th iteration
    up to 100, every 100th up to 1000, every 1000th up to 10000 and every
    10000th after that.
    """
    coefficients = np.array(initial_coefficients, dtype=float)  # private float copy
    result_shape = coefficients.shape
    coefficients = coefficients.reshape(-1)

    # Indicator value for (y_i = +1); loop-invariant, so computed once.
    indicator = (np.asarray(sentiment).reshape(-1) == +1)

    for itr in range(max_iter):
        # Predict P(y_i = +1 | x_i, w); flattened so the errors stay 1-D.
        predictions = np.asarray(predict_probability(feature_matrix, coefficients)).reshape(-1)
        errors = indicator - predictions

        for j in range(len(coefficients)):
            # feature_matrix[:, j] is the feature column associated with coefficients[j].
            derivative = feature_derivative(errors, feature_matrix[:, j])
            # Gradient ascent: move the coefficient along its derivative.
            coefficients[j] += step_size * derivative

        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
                or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            # Passed as a column vector, the coefficient layout this notebook
            # has always used when calling compute_log_likelihood.
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients.reshape(-1, 1))
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))

    return coefficients.reshape(result_shape)
In [17]:
# One zero-initialised coefficient per column of feature_matrix, stored as a
# column vector. The size is read from the matrix itself instead of being
# hard-coded, so it stays correct if the feature list changes.
initial_coefficients = np.zeros((feature_matrix.shape[1], 1))
step_size = 1e-7
max_iter = 301
In [20]:
# Train the model with the settings defined above; logistic_regression prints
# the log likelihood at selected iterations while it runs.
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)
In [34]:
# NOTE(review): this call has the same arguments as the training call in the
# previous cell, so it retrains from initial_coefficients and rebinds
# `coefficients`. Looks like a leftover re-run - confirm whether it is needed.
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)
In [36]:
"""
feature_matrix: N * D
coefficients: D * 1
predictions: N * 1
"""
# Count the reviews predicted positive in two equivalent ways: probability
# above 0.5, and raw score above 0. The expressions are wrapped in print(...)
# so that .sum() applies to the array on Python 3 as well as Python 2.
predictions = predict_probability(feature_matrix, coefficients)
NumPositive = (predictions > 0.5).sum()
print(NumPositive)
score = np.dot(feature_matrix, coefficients) # N * 1
print((score > 0).sum())
In [22]:
# Does any review carry the label 0? `0 in series` tests the Series *index*
# labels, not its values, so the values are compared explicitly.
print((products['sentiment'] == 0).any())
In [23]:
# Does any review carry the label -1? `-1 in series` tests the Series *index*
# labels, not its values, so the values are compared explicitly.
print((products['sentiment'] == -1).any())
In [24]:
# The flattened predictions and the label column should have the same length.
# (Transposing a 1-D array is a no-op, so it is omitted; the parentheses make
# .shape apply to the data on Python 3 as well as Python 2.)
print(predictions.flatten().shape)
print(products['sentiment'].shape)
In [25]:
# First five predicted probabilities; the slice is taken inside print(...) so
# it applies to the array on Python 3 as well as Python 2.
print(predictions.flatten()[:5])
In [46]:
# Accuracy when a review is predicted positive if its probability exceeds 0.5.
predicted_positive = predictions.flatten() > 0.5
actual_positive = np.array(products['sentiment'] > 0)
correct_num = np.sum(predicted_positive == actual_positive)
total_num = len(products['sentiment'])
print("correct_num: {}, total_num: {}".format(correct_num, total_num))
# Multiplying by 1. forces floating-point division on Python 2 as well.
accuracy = correct_num * 1. / total_num
print(accuracy)
In [39]:
# Boolean array: True where the predicted probability exceeds 0.5.
np.transpose(predictions.flatten())> 0.5
Out[39]:
In [45]:
# Boolean array: True where the sentiment label is greater than 0.
np.array(products['sentiment']>0)
Out[45]:
In [48]:
# Accuracy when a review is predicted positive if its raw score exceeds 0.
score_positive = score.flatten() > 0
label_positive = np.array(products['sentiment'] > 0)
correct_num = np.sum(score_positive == label_positive)
total_num = len(products['sentiment'])
print("correct_num: {}, total_num: {}".format(correct_num, total_num))
# Multiplying by 1. forces floating-point division on Python 2 as well.
accuracy = correct_num * 1. / total_num
print(accuracy)
In [28]:
# Pair every important word with its learned coefficient, largest first.
# `coefficients` itself is left untouched: the earlier version overwrote it
# with a shortened list, so re-running the cell dropped one more entry each
# time and silently misaligned words and weights. Flattening also yields plain
# scalars instead of one-element arrays.
word_coefficients = np.asarray(coefficients).reshape(-1)[1:]  # exclude intercept
word_coefficient_tuples = sorted(zip(important_words, word_coefficients),
                                 key=lambda pair: pair[1], reverse=True)
In [29]:
# The ten words with the largest coefficients (the list is sorted in descending order).
word_coefficient_tuples[:10]
Out[29]:
In [30]:
# The ten words with the smallest coefficients (the tail of the descending list).
word_coefficient_tuples[-10:]
Out[30]:
In [31]:
# Demonstrates that == on NumPy arrays compares element by element and yields
# a boolean array (parenthesised print runs on Python 2 and 3).
print(np.array([1,2,3]) == np.array([1,3,2]))
In [ ]: